New York taxis are busy. The travelling salesman problem is a big challenge for data scientists who help customers such as Uber and Lyft move faster from the pickup point to the drop-off point. Using datasets that contain trip_duration and fare_amount, this EDA helps the customer understand the data from various angles, so that we can gain insight and prepare for the travelling salesman problem.

import holoviews as hv
import geoviews as gv
import param, paramnb, parambokeh
import pandas as pd
import dask.dataframe as dd
from colorcet import cm
from bokeh.models import WMTSTileSource
from holoviews.operation.datashader import datashade
from holoviews.streams import RangeXY, PlotSize
import ipywidgets as widgets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import lightgbm as lgbm
import io
import os
import gc
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import os
# from jupyterthemes import jtplot
# jtplot.style(context='talk', fscale=1.4, spines=False)
# Trip-duration training set; the columns at positions 2 and 3 are parsed as datetimes.
data = pd.read_csv(
    'train.csv',
    engine='c',
    infer_datetime_format=True,
    parse_dates=[2, 3],
)

# Row-limited samples of the fare-prediction training set.
df_train = pd.read_csv(filepath_or_buffer='Fare-Predictions/train.csv', nrows=50000)
df_train5k = pd.read_csv(filepath_or_buffer='Fare-Predictions/train.csv', nrows=5000)

# Only the first 500,000 rows are read: loading more may slow the machine down.
train = pd.read_csv(filepath_or_buffer='traintrip.csv', nrows=500000)
train.head(10)
# Clean data process
# Trips shorter than one minute or longer than two hours are treated as anomalies.
duration_mask = (data['trip_duration'] < 60) | (data['trip_duration'] > 3600 * 2)
print('Anomalies in trip duration, %: {:.2f}'.format(duration_mask.sum() / data.shape[0] * 100))

# Take an explicit copy so the column assignment below writes to an
# independent frame rather than to a slice of the original
# (avoids pandas' chained-assignment / SettingWithCopy warnings).
data = data.loc[~duration_mask].copy()
# Every remaining duration is at most 7200 s, which fits into uint16.
data['trip_duration'] = data['trip_duration'].astype(np.uint16)
print('Trip duration in seconds: {} to {}'.format(data['trip_duration'].min(),
                                                  data['trip_duration'].max()))

# Trips without any passenger are reported and then removed.
print('Empty trips: {}'.format((data['passenger_count'] == 0).sum()))
data = data.loc[data['passenger_count'] > 0].copy()
# Function for missing data
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints a two-line report (number of columns, number of columns with
    missing values) and returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name, with the columns ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (rounded to one decimal), sorted by
        percentage in descending order.
    """
    # Scan for nulls once and reuse the counts for the percentage.
    mis_val = df.isnull().sum()
    n_rows = len(df)
    # A frame without rows has nothing missing; this also avoids 0/0 -> NaN,
    # which would otherwise make every column look incomplete.
    mis_val_percent = 100 * mis_val / n_rows if n_rows else mis_val.astype(float)

    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    has_missing = mis_val_table['% of Total Values'] != 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Spaces around the numbers keep the report readable.
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns
## Function support the code
def distance(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in miles.

    The four arguments are latitudes/longitudes in degrees. Because only
    NumPy ufuncs and arithmetic are used, scalars and arrays both work and
    arrays are processed elementwise.
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    km_to_miles = 0.6213712
    earth_diameter_km = 12742  # 2*R

    cos_dlat = np.cos((lat2 - lat1) * deg_to_rad)
    cos_dlon = np.cos((lon2 - lon1) * deg_to_rad)
    cos_lats = np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad)

    # Haversine term; the distance is 2*R*asin(sqrt(hav)).
    hav = 0.5 - cos_dlat / 2 + cos_lats * (1 - cos_dlon) / 2
    return km_to_miles * earth_diameter_km * np.arcsin(np.sqrt(hav))
# First calculate two arrays with datapoint density per sq mile
n_lon, n_lat = 200, 200  # number of grid bins per longitude, latitude dimension
BB = (-74.5, -72.8, 40.5, 41.8)  # (min lon, max lon, min lat, max lat)

# numpy.digitize() assigns every coordinate to a bin; it needs the array of
# bin edges (n + 1 edges for n bins) per dimension.
delta_lon = (BB[1] - BB[0]) / n_lon  # bin longitude width
delta_lat = (BB[3] - BB[2]) / n_lat  # bin latitude height
bins_lon = BB[0] + np.arange(n_lon + 1) * delta_lon
bins_lat = BB[2] + np.arange(n_lat + 1) * delta_lat
bin_width_miles = distance(BB[2], BB[1], BB[2], BB[0]) / n_lon  # bin width in miles
bin_height_miles = distance(BB[3], BB[0], BB[2], BB[0]) / n_lat  # bin height in miles

# Digitize per longitude, latitude dimension
inds_pickup_lon = np.digitize(df_train.pickup_longitude, bins_lon)
inds_pickup_lat = np.digitize(df_train.pickup_latitude, bins_lat)
inds_dropoff_lon = np.digitize(df_train.dropoff_longitude, bins_lon)
inds_dropoff_lat = np.digitize(df_train.dropoff_latitude, bins_lat)

dxdy = bin_width_miles * bin_height_miles  # area of one grid cell


def _grid_density(inds_lon, inds_lat, n_lon, n_lat, cell_area):
    """Count the points per grid cell and divide by the cell area.

    The result is laid out like an image: the first index is the
    y-direction and is reversed (row ``n_lat - lat_index``), the second
    index is the x-direction (column ``lon_index - 1``).
    """
    counts = np.zeros((n_lat, n_lon))
    # digitize() yields 0 or n + 1 for values outside the bin edges; those
    # points belong to no grid cell and are skipped.
    inside = ((inds_lon >= 1) & (inds_lon <= n_lon) &
              (inds_lat >= 1) & (inds_lat <= n_lat))
    # One pass over the points instead of re-scanning them for every cell;
    # np.add.at accumulates correctly when several points hit the same cell.
    np.add.at(counts, (n_lat - inds_lat[inside], inds_lon[inside] - 1), 1)
    return counts / cell_area


density_pickup = _grid_density(inds_pickup_lon, inds_pickup_lat, n_lon, n_lat, dxdy)
density_dropoff = _grid_density(inds_dropoff_lon, inds_dropoff_lat, n_lon, n_lat, dxdy)
# Report the missing values of both training frames (shown as notebook output).
missingdf_train = missing_values_table(df_train)
missingdf_train
missingtrain = missing_values_table(train)
missingtrain

# Discard every row that has at least one missing value, then check again.
df_train = df_train.dropna(axis=0, how='any')
missing = missing_values_table(df_train)
missing
df_train

## Derive calendar features from the pickup timestamp for time-series analysis.
df_train['pickup_datetime'] = pd.to_datetime(df_train['pickup_datetime'])
for _feature, _attr in (('month', 'month'), ('year', 'year'),
                        ('day_of_week', 'weekday'), ('day_of_hour', 'hour')):
    df_train['pickup_datetime_' + _feature] = getattr(df_train['pickup_datetime'].dt, _attr)

# Second name for the frame that still holds every column (the drop below
# rebinds ``df_train`` to a new frame).
df_trains = df_train
data.head(1)

# Correlation between fare and passenger count, computed before the column is dropped.
correlation = np.corrcoef(df_train['fare_amount'], df_train['passenger_count'])
df_train = df_train.drop(['passenger_count', 'pickup_datetime', 'key'], axis=1)
# Features for data
data = data[data['passenger_count'] > 0]
# Store the flag as a categorical column.
data['store_and_fwd_flag'] = data['store_and_fwd_flag'].astype('category')

# Month (mm), day of week (dow) and hour of day (hh), each for pickup and
# dropoff, stored as uint8. The loop order keeps the column order
# mm_pickup, mm_dropoff, dow_pickup, dow_dropoff, hh_pickup, hh_dropoff.
for _prefix, _attr in (('mm', 'month'), ('dow', 'weekday'), ('hh', 'hour')):
    for _side in ('pickup', 'dropoff'):
        _stamps = data[_side + '_datetime']
        data[_prefix + '_' + _side] = getattr(_stamps.dt, _attr).astype(np.uint8)
dow_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(12, 2))
# Number of trips per pickup weekday.
datas = data.groupby('dow_pickup').aggregate({'id': 'count'}).reset_index()
sns.barplot(x='dow_pickup', y='id', data=datas)
## PLOT
plt.title('Pick-Up Passenger Weekday Distribution')
# The x axis shows the pickup weekday, not a duration, so label it as such.
plt.xlabel('Day of Week')
# Replace the numeric weekday ticks 0-6 with the day names.
plt.xticks(range(0, 7), dow_names, rotation='horizontal')
plt.ylabel('No of Trips made')
The distribution of pickups by day of the week. Friday appears to be the most popular day to hail a taxi, with close to 220,000 trips made, while Sunday is at the other end of the spectrum with approximately 165,000 trips.
# Bar chart of the number of trips per pickup hour.
plt.figure(figsize=(12, 2))
datas1 = (
    data
    .groupby('hh_pickup')
    .aggregate({'id': 'count'})
    .reset_index()
)
sns.barplot(data=datas1, x='hh_pickup', y='id')
plt.title('Pick-ups Hour Distribution')
plt.xlabel('Hour of Day, 0-23')
plt.ylabel('No of Trips made')
# plt.savefig('Figures/pickups-hour-distribution.png')
Pickup-time heatmap summarising the results of Figure 1 and Figure 2. The most active pickup times are between 6 and 9 PM from Thursday to Sunday. The least active pickup times are between 2 and 5 PM from Wednesday to Sunday.
## Version 1: interactive heatmap built with Plotly.
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import plotly.io as pio
# Make "plotly_dark" the default Plotly template.
pio.templates.default = "plotly_dark"
def HeatPlot1(data, down_names):
    """Show a weekday-by-hour heatmap of pickup frequency.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``dow_pickup``, ``hh_pickup`` and
        ``vendor_id``.
    down_names : sequence of str
        Tick labels for the weekday axis, in row order.
    """
    # Trips counted per (weekday, hour); normalize='index' turns every
    # weekday row into shares of that row's total.
    share_by_hour = pd.crosstab(data.dow_pickup, data.hh_pickup, values=data.vendor_id,
                                aggfunc='count', normalize='index')
    fig = px.imshow(share_by_hour, labels=dict(x="Time of Day", y="Weekly"))
    fig.update_layout(
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(len(down_names))),
            # Use the labels passed in by the caller rather than a global.
            ticktext=down_names,
        )
    )
    fig.update_xaxes(side="top")
    fig.show()


HeatPlot1(data, dow_names)
The trip-duration heatmap is plotted by hour of the day and day of the week. The longest trips commonly occur between 2 and 4 PM from Wednesday to Friday. The shortest trips happen between 8 and 9 AM on Monday and Tuesday.
# Read the parquet dataset with Dask and call persist() on the resulting frame.
df = dd.read_parquet('./nyc_taxi.parq/').persist()
## Look at the size and the first rows to understand the passenger pickup locations.
print(len(df))
df.head(2)
## the style of sns.
sns.set(style="white")
# Work on a copy of the trip-duration sample.
temp3 = train.copy()
# Compute the correlation matrix on numeric/boolean columns only: text
# columns cannot be correlated and make DataFrame.corr() raise on
# pandas >= 2.0.
corr = temp3.select_dtypes(include=['number', 'bool']).corr()
# Generate a mask for the upper triangle. The ``np.bool`` alias was removed
# from NumPy; the builtin ``bool`` is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 13))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
We can see some direct correlations, for example between passenger_count and pickup_longitude, which indicates that a linear relationship exists between them.
There is a weak positive correlation between the longitude variables and trip_duration. There is also a weak negative correlation between the latitude variables and trip_duration.
Otherwise, there doesn't appear to be much of a linear relationship between our target variable and the remaining features.
# Fare per pickup hour (rows) with one column, and therefore one line, per pickup year.
df_plot = df_train.pivot_table(
    values='fare_amount',
    index='pickup_datetime_day_of_hour',
    columns='pickup_datetime_year',
)
sns.set_style("whitegrid", {'axes.grid': False})
df_plot.plot(cmap="plasma", figsize=(14, 6))
plt.ylabel('Fare $USD')
plt.xlabel('Hour of the days')
We can see the fare in USD for each hour of the day across the different years. Around 5 AM the fare is very high, and it also rises a bit after 3 PM. The morning is observed here as the rush hour.
## Add the log of the trip duration; log1p(x) = log(1 + x).
train['log_trip_duration'] = np.log1p(train['trip_duration'].values)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
fig.suptitle('Train trip duration and log of trip duration')

# Draw the histograms first and give them labels: legend() only lists
# artists that already exist and carry a label, so calling it on empty
# axes produces an empty legend and a warning.
ax1.hist(train.trip_duration, color='black', bins=7, label='trip duration')
ax2.hist(train.log_trip_duration, bins=50, color='Green', label='log(trip duration)')

ax1.set_ylabel('count')
ax1.set_xlabel('trip duration')
ax2.set_xlabel('log(trip duration)')
ax1.legend(loc=0)
ax2.legend(loc=0)
The log transform is a good way to normalize the data: after transforming the trip duration, we visualize the log trip duration.
# Remove negative fares and report the row count before and after.
print('Old size: %d' % len(df_train))
df_train = df_train[df_train['fare_amount'] >= 0]
print('New size: %d' % len(df_train))

# Histogram of the fares below $100.
df_train.loc[df_train['fare_amount'] < 100, 'fare_amount'].hist(
    bins=100, figsize=(14, 3), color="blue")
plt.xlabel('fare $USD')
plt.title('Histogram')
This plot displays the fare amounts below 100 USD. The bar chart helps us understand how trip fares are distributed. As we can see, the highest bar exceeds 6,000 trips.
# Trip length in miles and the resulting fare per mile.
df_train['distance_miles'] = distance(
    df_train['pickup_latitude'], df_train['pickup_longitude'],
    df_train['dropoff_latitude'], df_train['dropoff_longitude'],
)
df_train['fare_per_mile'] = df_train['fare_amount'] / df_train['distance_miles']

fig, axs = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: distance against fare for every trip.
axs[0].scatter(df_train['distance_miles'], df_train['fare_amount'], alpha=0.2, color='blue')
axs[0].set_title('All data in big scope')
axs[0].set_xlabel('Distance Mile')
axs[0].set_ylabel('Fare $USD')

# Right panel: only the short and cheap trips.
idx = (df_train['distance_miles'] < 15) & (df_train['fare_amount'] < 100)
axs[1].scatter(df_train.loc[idx, 'distance_miles'], df_train.loc[idx, 'fare_amount'],
               alpha=0.2, color='green')
axs[1].set_title('Zoom in on distance < 15 mile, fare < $100')
axs[1].set_xlabel('distance mile')
axs[1].set_ylabel('fare $USD')
When we zoom in on a smaller subset of the data (the green plot), most points fall along a single line. Overall, fare amount and distance appear to be linearly related.
nyc = (-74.0063889, 40.7141667)  # (longitude, latitude) of the reference point
### Distance of every pickup from that reference point
df_train['distance_to_center'] = distance(
    nyc[1], nyc[0], df_train['pickup_latitude'], df_train['pickup_longitude'])

fig, axs = plt.subplots(1, 2, figsize=(19, 15))
### Choose the trip to center < 15
idx = (df_train['distance_to_center'] < 15) & (df_train['distance_miles'] < 35)

# Both panels are drawn the same way: first all trips, then the zoomed selection.
for _ax, _trips, _title in ((axs[0], df_train, 'All data'),
                            (axs[1], df_train[idx], 'Zoom in')):
    # Colour encodes the fare, clipped to the range 0..100.
    im = _ax.scatter(_trips['distance_to_center'], _trips['distance_miles'],
                     c=np.clip(_trips['fare_amount'], 0, 100),
                     cmap='plasma', alpha=1.0, s=1)
    _ax.set_xlabel('pickup distance from NYC center')
    _ax.set_ylabel('distance miles')
    _ax.set_title(_title)
    cbar = fig.colorbar(im, ax=_ax)
    cbar.ax.set_ylabel('fare_amount', rotation=270)
There are a lot of 'purple-pink' dots, corresponding to a fare of about \$50 to \$60, at a trip distance of about 13 miles from the NYC center. This could be due to trips from/to JFK airport.
So far we have mainly considered the total distance of a trip as the main feature for predicting the fare amount. However, what about the direction of a trip? To visualise this, we start with a simple plot of the delta longitude and delta latitude against the fare amount.
def select_within_boundingbox(df, BB):
    """Return a boolean mask of the trips that start and end inside ``BB``.

    ``BB`` is indexed as ``(min_lon, max_lon, min_lat, max_lat)`` and the
    bounds are inclusive. ``df`` must provide pickup and dropoff
    longitude/latitude columns.
    """
    lon_min, lon_max, lat_min, lat_max = BB[0], BB[1], BB[2], BB[3]
    checks = (
        (df.pickup_longitude, lon_min, lon_max),
        (df.pickup_latitude, lat_min, lat_max),
        (df.dropoff_longitude, lon_min, lon_max),
        (df.dropoff_latitude, lat_min, lat_max),
    )
    inside = None
    for values, low, high in checks:
        within = (values >= low) & (values <= high)
        inside = within if inside is None else inside & within
    return inside
## Longitude/latitude displacement of every trip (pickup minus dropoff).
df_train['delta_lon'] = df_train['pickup_longitude'] - df_train['dropoff_longitude']
df_train['delta_lat'] = df_train['pickup_latitude'] - df_train['dropoff_latitude']
# NOTE(review): BoundingBox is not used in the code visible here.
from planar import BoundingBox

# Select trips in Manhattan
BB_manhattan = (-74.025, -73.925, 40.7, 40.8)
idx_manhattan = select_within_boundingbox(df_train, BB_manhattan)

plt.figure(figsize=(14, 8))
# Colour encodes log1p of the fare.
plt.scatter(
    df_train.loc[idx_manhattan, 'delta_lon'],
    df_train.loc[idx_manhattan, 'delta_lat'],
    c=np.log1p(df_train.loc[idx_manhattan, 'fare_amount']),
    cmap='inferno',
    s=0.7,
    alpha=1.0,
)
plt.colorbar()
plt.xlabel('Pickup_longitude - dropoff_longitude')
plt.ylabel('Pickup_latitude - dropoff_latidue')
plt.title('log 1p (fare_amount)')
After this plot, we focus on understanding the trip duration.
import seaborn as sns
sns.set(style="whitegrid", palette="pastel", color_codes=True)
sns.set_context("poster")

# Plot from a copy whose durations are log-transformed, so ``train`` stays untouched.
train2 = train.copy()
train2['trip_duration'] = np.log(train2['trip_duration'])

# Split violins: one half per vendor, for every passenger count.
sns.violinplot(
    data=train2,
    x="passenger_count",
    y="trip_duration",
    hue="vendor_id",
    split=True,
    inner="quart",
    palette={1: "g", 2: "r"},
)
sns.despine(left=True)
# NOTE(review): this runs after the plot has been drawn — confirm it is
# meant for later figures.
sns.set(rc={'figure.figsize': (15, 6)})
print(len(train2))
For a passenger count of 0, it is not entirely clear why the trip_duration drops below zero. One possible explanation: trips with zero passengers may be trips where a taxi is called to a particular location and the customer is charged for the taxi coming there. The distributions are similar for both vendors; for passenger counts of 1, 2, 3 and 4, both vendors look fairly alike.
hv.extension('bokeh')

# Pickup points, with the passenger count carried along as value dimension.
points = hv.Points(df, kdims=['pickup_x', 'pickup_y'], vdims=['passenger_count'])
options = {
    'width': 800,
    'height': 475,
    'xaxis': None,
    'yaxis': None,
    'bgcolor': 'black',
    'show_grid': False,
}
taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=cm['fire']).opts(plot=options)
taxi_trips

# Tile layer, overlaid with the datashaded trips.
tiles = gv.WMTS(WMTSTileSource(
    url='https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{Z}/{Y}/{X}.jpg'))
tiles * taxi_trips
Let us show the points overlaid on a real map.
This work follows the reference from the SciPy package; it gives us more options to zoom and to plot the pickup and drop-off data points. We can see where passengers start their trips, and that the centre of Manhattan is extremely busy.
class TaxDataEDA(hv.streams.Stream):
    """Stream that declares the settings of the taxi map as parameters.

    It declares four ``param`` parameters: the map opacity, the point set
    to plot, the colormap and a passenger-count range.
    """
    # Opacity of the map.
    alpha = param.Magnitude(default=0.75, doc="Alpha value for the map opacity")
    # Which set of trip end points to plot.
    plot = param.ObjectSelector(default="pickup", objects=["pickup","dropoff"])
    # Selectable colormaps: every value of colorcet's ``cm`` mapping.
    colormap = param.ObjectSelector(default=cm["fire"], objects=cm.values())
    # Passenger-count range, bounded to 0..10.
    passengers = param.Range(default=(0, 10), bounds=(0, 10), doc="""
Filter for taxi trips by number of passengers""")
# Look at the class-level value of the ``alpha`` parameter.
TaxDataEDA.alpha
# Change the class-level value of ``alpha``.
TaxDataEDA.alpha = 0.5
TaxDataEDA
## Try to assign a string to ``alpha``; if that raises, the error message is printed.
try:
    TaxDataEDA.alpha = '0'
except Exception as e:
    print(e)
# Create an instance with its own alpha value and inspect it.
explorer = TaxDataEDA(alpha=0.6)
explorer.alpha
TaxDataEDA.passengers
class TaxDataEDA(hv.streams.Stream):
    """Stream with map parameters plus a method that renders the taxi map."""

    alpha = param.Magnitude(default=0.75, doc="Alpha value for the map opacity")
    colormap = param.ObjectSelector(default=cm["fire"], objects=cm.values())
    plot = param.ObjectSelector(default="pickup", objects=["pickup", "dropoff"])
    passengers = param.Range(default=(1, 9), bounds=(1, 9))

    def make_view(self, x_range=None, y_range=None, **kwargs):
        """Return the map tiles overlaid with the datashaded trip points.

        The points are the ``<plot>_x`` / ``<plot>_y`` columns of the
        module-level ``df``, restricted to the ``passengers`` range.
        ``x_range`` and ``y_range`` are forwarded to ``datashade``; extra
        keyword arguments are accepted and ignored.
        """
        map_tiles = tiles.opts(style=dict(alpha=self.alpha), plot=options)
        x_dim, y_dim = self.plot + '_x', self.plot + '_y'
        trips = hv.Points(df, kdims=[x_dim, y_dim], vdims=['passenger_count'])
        in_range = trips.select(passenger_count=self.passengers)
        shaded = datashade(
            in_range,
            x_sampling=1,
            y_sampling=1,
            cmap=self.colormap,
            dynamic=False,
            x_range=x_range,
            y_range=y_range,
            width=800,
            height=475,
        )
        return map_tiles * shaded
# Render one view of the dropoff points with alpha=0.2 (plotted with the Bokeh backend).
explorer = TaxDataEDA(alpha=0.2, plot="dropoff")
explorer.make_view()
# New instance with default parameters, shown as notebook widgets;
# explorer.event is passed as the widgets' callback.
explorer = TaxDataEDA()
paramnb.Widgets(explorer, callback=explorer.event)
# Dynamic map built from make_view, driven by the explorer and a RangeXY stream.
hv.DynamicMap(explorer.make_view, streams=[explorer, RangeXY()])
## Pickup locations of the 5,000-row sample as markers on a Mapbox map.
# NOTE(review): this rebinds ``data``, which earlier held the trip DataFrame.
data = [
    go.Scattermapbox(
        lat=df_train5k['pickup_latitude'],
        lon=df_train5k['pickup_longitude'],
        mode='markers',
        marker={'size': 4, 'color': 'white', 'opacity': .8},
    )
]
## set the layout
layout = go.Layout(
    autosize=False,
    width=900,
    height=600,
    title="Pick up Locations in NewYork",
    mapbox={
        'accesstoken': "pk.eyJ1Ijoic2hhejEzIiwiYSI6ImNqYXA3NjhmeDR4d3Iyd2w5M2phM3E2djQifQ.yyxsAzT94VGYYEEOhxy87w",
        'bearing': 10,
        'pitch': 60,
        'zoom': 13,
        'center': {'lat': 40.721319, 'lon': -73.987130},
        'style': "mapbox://styles/shaz13/cjiog1iqa1vkd2soeu5eocy4i",
    },
)
fig = {'data': data, 'layout': layout}
offline.iplot(fig)
All pickup locations are shown on a 3D dark-night map, which helps the customer easily see where the pickup locations are. With Plotly you can also interactively zoom in and out. Try it out and explore the bird's-eye view of the data points. To get familiar with the plots again, let's plot the drop-off locations using another styled map. Mapbox provides many styles, and you can also make custom maps in their studio platform.
## Dropoff locations of the 5,000-row sample as markers on a Mapbox map.
# NOTE(review): this rebinds ``data``, which earlier held the trip DataFrame.
data = [
    go.Scattermapbox(
        lat=df_train5k['dropoff_latitude'],
        lon=df_train5k['dropoff_longitude'],
        mode='markers',
        marker={'size': 4, 'color': 'cyan', 'opacity': .8},
    )
]
## Setting the layout for map.
layout = go.Layout(
    autosize=False,
    width=900,
    height=600,
    title="Drop off locations in Newyork",
    mapbox={
        'accesstoken': "pk.eyJ1Ijoic2hhejEzIiwiYSI6ImNqYXA3NjhmeDR4d3Iyd2w5M2phM3E2djQifQ.yyxsAzT94VGYYEEOhxy87w",
        'bearing': 10,
        'pitch': 60,
        'zoom': 13,
        'center': {'lat': 40.721319, 'lon': -73.987130},
        'style': "mapbox://styles/shaz13/cjk4wlc1s02bm2smsqd7qtjhs",
    },
)
fig = {'data': data, 'layout': layout}
offline.iplot(fig)
# Use the larger sample here so that the point pattern is clearly visible.
west, south, east, north = -74.03, 40.63, -73.77, 40.85
# Keep only the trips whose pickup and dropoff lie strictly inside the box.
train = train[
    (train['pickup_latitude'] > south) & (train['pickup_latitude'] < north)
    & (train['dropoff_latitude'] > south) & (train['dropoff_latitude'] < north)
    & (train['pickup_longitude'] > west) & (train['pickup_longitude'] < east)
    & (train['dropoff_longitude'] > west) & (train['dropoff_longitude'] < east)
]

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(15, 10))
# One panel per trip end, drawn as tiny points on a black background.
for _ax, _side, _colour, _title in ((ax1, 'pickup', 'yellow', "Pickups"),
                                    (ax2, 'dropoff', 'lightblue', "Dropoffs")):
    _ax.grid(False)
    train.plot(kind='scatter', x=_side + '_longitude', y=_side + '_latitude',
               color=_colour,
               s=.02, alpha=.6, subplots=True, ax=_ax, grid=False)
    _ax.set_title(_title)
    _ax.set_facecolor('black')
As we can see, the heaviest travel is located in the centre of Manhattan, which is the brightest area, and at the airport, which is a small spot outside the city. With this plot, we know where to focus the trip recommendations in the model.
We have plotted a large amount of data analysis, which gives us a solid understanding of the New York taxi data. Our analysis suggests two things to do: first, optimize the trips to make them shorter; second, improve the cost of taxis for the customer. After this EDA, the work continues by building models with XGBoost and Random Forest in the next step.